/**
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
#ifndef IMPL_REDUCE_REDUCE_COMMON_AR_RA_REUSE_UNALIGN_C310_IMPL_H
#define IMPL_REDUCE_REDUCE_COMMON_AR_RA_REUSE_UNALIGN_C310_IMPL_H

#include "kernel_operator_intf.h"
#include "kernel_tensor.h"
#include "reduce_common_util_impl.h"
#include "reduce_common_util_c310_impl.h"

namespace AscendC {

template <class T, const MicroAPI::RegTrait &Trait, auto Binaryfunc>
__simd_callee__ inline void ReduceRAReuseSourceUnAlignedFoldOne(__ubuf__ T *dstAddr, uint64_t src, uint16_t loopANum,
    uint32_t dimA, uint16_t vlSize, uint32_t dtypeSize, MicroAPI::MaskReg maskMain, MicroAPI::MaskReg maskTail,
    uint32_t postUpdateStrideMain, uint32_t postUpdateStrideTail)
{
    uint64_t newSrc;
    MicroAPI::UnalignReg uSrc;
    MicroAPI::UnalignReg uDst;
    MicroAPI::RegTensor<T, Trait> vreg0;
    MicroAPI::RegTensor<T, Trait> vreg1;
    for (uint16_t loopA = 0; loopA < static_cast<uint16_t>(loopANum - 1); loopA++) {
        newSrc = src + static_cast<uint64_t>(loopA * vlSize * dtypeSize);
        // L0
        MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
        MicroAPI::DataCopyUnAlign(vreg0, uSrc, ((__ubuf__ T *&)newSrc), dimA);
        MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
        MicroAPI::DataCopyUnAlign(vreg1, uSrc, ((__ubuf__ T *&)newSrc), dimA);
        // L1
        Binaryfunc(vreg0, vreg0, vreg1, maskMain);
        DataCopyUnAlign((__ubuf__ T *&)dstAddr, vreg0, uDst, postUpdateStrideMain);
    }
    newSrc = src + static_cast<uint64_t>((loopANum - 1) * vlSize * dtypeSize);
    // L0
    MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
    MicroAPI::DataCopyUnAlign(vreg0, uSrc, ((__ubuf__ T *&)newSrc), dimA);
    MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
    MicroAPI::DataCopyUnAlign(vreg1, uSrc, ((__ubuf__ T *&)newSrc), dimA);
    // L1
    Binaryfunc(vreg0, vreg0, vreg1, maskTail);
    DataCopyUnAlign((__ubuf__ T *&)dstAddr, vreg0, uDst, postUpdateStrideTail);
    MicroAPI::DataCopyUnAlignPost((__ubuf__ T *&)dstAddr, uDst, 0);
}

template <class T, const MicroAPI::RegTrait &Trait, auto Binaryfunc>
__simd_callee__ inline void ReduceRAReuseSourceUnAlignedFoldTwo(__ubuf__ T *dstAddr, uint64_t src, uint16_t loopANum,
    uint32_t dimA, uint16_t vlSize, uint32_t dtypeSize, MicroAPI::MaskReg maskMain, MicroAPI::MaskReg maskTail,
    uint32_t postUpdateStrideMain, uint32_t postUpdateStrideTail)
{
    uint64_t newSrc;
    MicroAPI::UnalignReg uSrc;
    MicroAPI::UnalignReg uDst;
    MicroAPI::RegTensor<T, Trait> vreg0;
    MicroAPI::RegTensor<T, Trait> vreg1;
    MicroAPI::RegTensor<T, Trait> vreg2;
    MicroAPI::RegTensor<T, Trait> vreg3;
    for (uint16_t loopA = 0; loopA < static_cast<uint16_t>(loopANum - 1); loopA++) {
        newSrc = src + static_cast<uint64_t>(loopA * vlSize * dtypeSize);
        // L0
        MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
        MicroAPI::DataCopyUnAlign(vreg0, uSrc, ((__ubuf__ T *&)newSrc), dimA);
        MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
        MicroAPI::DataCopyUnAlign(vreg1, uSrc, ((__ubuf__ T *&)newSrc), dimA);
        MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
        MicroAPI::DataCopyUnAlign(vreg2, uSrc, ((__ubuf__ T *&)newSrc), dimA);
        MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
        MicroAPI::DataCopyUnAlign(vreg3, uSrc, ((__ubuf__ T *&)newSrc), dimA);
        // L1
        Binaryfunc(vreg0, vreg0, vreg2, maskMain);
        Binaryfunc(vreg1, vreg1, vreg3, maskMain);
        // L2
        Binaryfunc(vreg0, vreg0, vreg1, maskMain);
        DataCopyUnAlign((__ubuf__ T *&)dstAddr, vreg0, uDst, postUpdateStrideMain);
    }
    newSrc = src + static_cast<uint64_t>((loopANum - 1) * vlSize * dtypeSize);
    // L0
    MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
    MicroAPI::DataCopyUnAlign(vreg0, uSrc, ((__ubuf__ T *&)newSrc), dimA);
    MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
    MicroAPI::DataCopyUnAlign(vreg1, uSrc, ((__ubuf__ T *&)newSrc), dimA);
    MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
    MicroAPI::DataCopyUnAlign(vreg2, uSrc, ((__ubuf__ T *&)newSrc), dimA);
    MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
    MicroAPI::DataCopyUnAlign(vreg3, uSrc, ((__ubuf__ T *&)newSrc), dimA);
    // L1
    Binaryfunc(vreg0, vreg0, vreg2, maskTail);
    Binaryfunc(vreg1, vreg1, vreg3, maskTail);
    // L2
    Binaryfunc(vreg0, vreg0, vreg1, maskTail);
    DataCopyUnAlign((__ubuf__ T *&)dstAddr, vreg0, uDst, postUpdateStrideTail);
    MicroAPI::DataCopyUnAlignPost((__ubuf__ T *&)dstAddr, uDst, 0);
}
template <class T, const MicroAPI::RegTrait &Trait, auto Binaryfunc>
__simd_vf__ inline void ReduceRAReuseSourceUnAlignedB64VF(__ubuf__ T *dstAddr, __ubuf__ T *srcAddr, uint32_t dimA,
    uint32_t dimR, uint32_t mainR, uint16_t folds, uint16_t avgFolds, uint16_t foldZero, uint16_t foldOne,
    uint16_t foldTwo, uint32_t postUpdateStrideTail)
{
    constexpr uint16_t vlSize = SupportBytes<T, 8>() ? GetVecLen() / sizeof(float) : GetVecLen() / sizeof(T);
    uint32_t tailR = dimR - mainR;
    uint16_t needInplaceAdd = tailR > 0 ? 1 : 0;
    uint16_t base = mainR;
    uint16_t mainTimes = folds / avgFolds;

    // Process vlSize axisA each time
    uint16_t loopANum = (dimA + vlSize - 1) / vlSize;
    uint32_t dtypeSize = sizeof(T);
    uint32_t postUpdateStrideMain = vlSize;
    uint32_t tailA = postUpdateStrideTail;

    MicroAPI::RegTensor<T, Trait> vregMain;
    MicroAPI::RegTensor<T, Trait> vregTail;
    MicroAPI::UnalignReg vUregMain;
    MicroAPI::UnalignReg vUregTail;
    MicroAPI::UnalignReg uDst;
    MicroAPI::MaskReg maskMain = MicroAPI::CreateMask<T, MicroAPI::MaskPattern::ALL, Trait>();
    MicroAPI::MaskReg maskTail = MicroAPI::UpdateMask<T, Trait>(tailA);

    for (uint16_t i = 0; i < needInplaceAdd; i++) {
        for (uint16_t loopR = 0; loopR < static_cast<uint16_t>(tailR); loopR++) {
            uint64_t hoistMainAddr = (uint64_t)srcAddr + static_cast<uint64_t>(loopR * dimA * dtypeSize);
            uint64_t hoistTailAddr = hoistMainAddr + static_cast<uint64_t>(mainR * dimA * dtypeSize);
            uint64_t hoistMainReuseAddr = hoistMainAddr;
            MicroAPI::DataCopyUnAlignPre(vUregMain, ((__ubuf__ T *&)hoistMainAddr));
            MicroAPI::DataCopyUnAlignPre(vUregTail, ((__ubuf__ T *&)hoistTailAddr));
            for (uint16_t loopA = 0; loopA < static_cast<uint16_t>(loopANum - 1); loopA++) {
                MicroAPI::DataCopyUnAlign(vregMain, vUregMain, ((__ubuf__ T *&)hoistMainAddr),
                    postUpdateStrideMain);
                MicroAPI::DataCopyUnAlign(vregTail, vUregTail, ((__ubuf__ T *&)hoistTailAddr),
                    postUpdateStrideMain);
                Binaryfunc(vregMain, vregMain, vregTail, maskMain);
                MicroAPI::DataCopyUnAlign(((__ubuf__ T *&)hoistMainReuseAddr), vregMain, uDst,
                    postUpdateStrideMain);
            }
            MicroAPI::DataCopyUnAlign(vregMain, vUregMain, ((__ubuf__ T *&)hoistMainAddr), postUpdateStrideTail);
            MicroAPI::DataCopyUnAlign(vregTail, vUregTail, ((__ubuf__ T *&)hoistTailAddr), postUpdateStrideTail);
            Binaryfunc(vregMain, vregMain, vregTail, maskTail);
            MicroAPI::DataCopyUnAlign(((__ubuf__ T *&)hoistMainReuseAddr), vregMain, uDst, postUpdateStrideTail);
            MicroAPI::DataCopyUnAlignPost(((__ubuf__ T *&)hoistMainReuseAddr), uDst, 0);
        }
        MicroAPI::LocalMemBar<MicroAPI::MemType::VEC_STORE, MicroAPI::MemType::VEC_LOAD>();
    }

    // MainFolds need 8*2 register
    MicroAPI::RegTensor<T, Trait> vreg0;
    MicroAPI::RegTensor<T, Trait> vreg1;
    MicroAPI::RegTensor<T, Trait> vreg2;
    MicroAPI::RegTensor<T, Trait> vreg3;
    MicroAPI::RegTensor<T, Trait> vreg4;
    MicroAPI::RegTensor<T, Trait> vreg5;
    MicroAPI::RegTensor<T, Trait> vreg6;
    MicroAPI::RegTensor<T, Trait> vreg7;
    MicroAPI::UnalignReg uSrc;

    // Process main folds
    uint16_t loopRNum = base;
    uint64_t src = 0;
    uint64_t dst = 0;
    uint64_t newSrc = 0;
    uint64_t newSrcTail = 0;
    for (uint16_t loopMain = 0; loopMain < mainTimes; loopMain++) {
        loopRNum = loopRNum >> avgFolds;
        uint16_t offsetR = loopRNum * dimA;
        for (uint16_t loopR = 0; loopR < loopRNum; loopR++) {
            src = (uint64_t)srcAddr + static_cast<uint64_t>(loopR * dimA * dtypeSize);
            newSrcTail = src + static_cast<uint64_t>((loopANum - 1) * vlSize * dtypeSize);
            for (uint16_t loopA = 0; loopA < static_cast<uint16_t>(loopANum - 1); loopA++) {
                newSrc = src + static_cast<uint64_t>(loopA * vlSize * dtypeSize);
                dst = newSrc;
                // L0
                MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
                MicroAPI::DataCopyUnAlign(vreg0, uSrc, ((__ubuf__ T *&)newSrc), offsetR);
                MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
                MicroAPI::DataCopyUnAlign(vreg1, uSrc, ((__ubuf__ T *&)newSrc), offsetR);
                MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
                MicroAPI::DataCopyUnAlign(vreg2, uSrc, ((__ubuf__ T *&)newSrc), offsetR);
                MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
                MicroAPI::DataCopyUnAlign(vreg3, uSrc, ((__ubuf__ T *&)newSrc), offsetR);
                MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
                MicroAPI::DataCopyUnAlign(vreg4, uSrc, ((__ubuf__ T *&)newSrc), offsetR);
                MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
                MicroAPI::DataCopyUnAlign(vreg5, uSrc, ((__ubuf__ T *&)newSrc), offsetR);
                MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
                MicroAPI::DataCopyUnAlign(vreg6, uSrc, ((__ubuf__ T *&)newSrc), offsetR);
                MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
                MicroAPI::DataCopyUnAlign(vreg7, uSrc, ((__ubuf__ T *&)newSrc), offsetR);
                // L1
                Binaryfunc(vreg0, vreg0, vreg4, maskMain);
                Binaryfunc(vreg1, vreg1, vreg5, maskMain);
                Binaryfunc(vreg2, vreg2, vreg6, maskMain);
                Binaryfunc(vreg3, vreg3, vreg7, maskMain);
                // L2
                Binaryfunc(vreg0, vreg0, vreg2, maskMain);
                Binaryfunc(vreg1, vreg1, vreg3, maskMain);
                // L3
                Binaryfunc(vreg0, vreg0, vreg1, maskMain);
                DataCopyUnAlign((__ubuf__ T *&)dst, vreg0, uDst, postUpdateStrideMain);
            }
            dst = newSrcTail;
            // L0
            MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrcTail));
            MicroAPI::DataCopyUnAlign(vreg0, uSrc, ((__ubuf__ T *&)newSrcTail), offsetR);
            MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrcTail));
            MicroAPI::DataCopyUnAlign(vreg1, uSrc, ((__ubuf__ T *&)newSrcTail), offsetR);
            MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrcTail));
            MicroAPI::DataCopyUnAlign(vreg2, uSrc, ((__ubuf__ T *&)newSrcTail), offsetR);
            MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrcTail));
            MicroAPI::DataCopyUnAlign(vreg3, uSrc, ((__ubuf__ T *&)newSrcTail), offsetR);
            MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrcTail));
            MicroAPI::DataCopyUnAlign(vreg4, uSrc, ((__ubuf__ T *&)newSrcTail), offsetR);
            MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrcTail));
            MicroAPI::DataCopyUnAlign(vreg5, uSrc, ((__ubuf__ T *&)newSrcTail), offsetR);
            MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrcTail));
            MicroAPI::DataCopyUnAlign(vreg6, uSrc, ((__ubuf__ T *&)newSrcTail), offsetR);
            MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrcTail));
            MicroAPI::DataCopyUnAlign(vreg7, uSrc, ((__ubuf__ T *&)newSrcTail), offsetR);
            // L1
            Binaryfunc(vreg0, vreg0, vreg4, maskTail);
            Binaryfunc(vreg1, vreg1, vreg5, maskTail);
            Binaryfunc(vreg2, vreg2, vreg6, maskTail);
            Binaryfunc(vreg3, vreg3, vreg7, maskTail);
            // L2
            Binaryfunc(vreg0, vreg0, vreg2, maskTail);
            Binaryfunc(vreg1, vreg1, vreg3, maskTail);
            // L3
            Binaryfunc(vreg0, vreg0, vreg1, maskTail);
            DataCopyUnAlign((__ubuf__ T *&)dst, vreg0, uDst, postUpdateStrideTail);
            MicroAPI::DataCopyUnAlignPost((__ubuf__ T *&)dst, uDst, 0);
        }
        MicroAPI::LocalMemBar<MicroAPI::MemType::VEC_STORE, MicroAPI::MemType::VEC_LOAD>();
    }

    // Process tail folds
    src = (uint64_t)srcAddr;
    for (uint16_t i = 0; i < foldOne; i++) {
        ReduceRAReuseSourceUnAlignedFoldOne<T, Trait, Binaryfunc>(dstAddr, src, loopANum, dimA, vlSize, dtypeSize, maskMain,
            maskTail, postUpdateStrideMain, postUpdateStrideTail);
    }

    for (uint16_t i = 0; i < foldTwo; i++) {
        ReduceRAReuseSourceUnAlignedFoldTwo<T, Trait, Binaryfunc>(dstAddr, src, loopANum, dimA, vlSize, dtypeSize, maskMain,
            maskTail, postUpdateStrideMain, postUpdateStrideTail);
    }

    // Reduce to 1
    for (uint16_t i = 0; i < foldZero; ++i) {
        for (uint16_t loopA = 0; loopA < static_cast<uint16_t>(loopANum - 1); loopA++) {
            MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)srcAddr));
            MicroAPI::DataCopyUnAlign(vreg0, uSrc, ((__ubuf__ T *&)srcAddr), postUpdateStrideMain);
            MicroAPI::DataCopyUnAlign((__ubuf__ T *&)dstAddr, vreg0, uDst, postUpdateStrideMain);
        }
        MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)srcAddr));
        MicroAPI::DataCopyUnAlign(vreg0, uSrc, ((__ubuf__ T *&)srcAddr), postUpdateStrideTail);
        MicroAPI::DataCopyUnAlign((__ubuf__ T *&)dstAddr, vreg0, uDst, postUpdateStrideTail);
        MicroAPI::DataCopyUnAlignPost((__ubuf__ T *&)dstAddr, uDst, 0);
    }
}

template <class T, const MicroAPI::RegTrait &Trait, auto Binaryfunc>
__aicore__ inline void ReduceRAReuseSourceUnAlignedB64(__ubuf__ T *dstAddr, __ubuf__ T *srcAddr, uint32_t dimA,
    uint32_t dimR)
{
    constexpr uint16_t vlSize = SupportBytes<T, 8>() ? GetVecLen() / sizeof(float) : GetVecLen() / sizeof(T);
    uint32_t mainR = ReduceOpInternal::CalculateMainR(dimR, false, vlSize);

    uint16_t folds = ReduceOpInternal::CalculateFolds(mainR);
    uint16_t avgFolds = ReduceOpInternal::BASE_FOLD_B64;
    uint16_t tailFolds = folds % avgFolds;
    uint16_t foldZero = (tailFolds == 0) ? 1 : 0;
    uint16_t foldOne = (tailFolds == ReduceOpInternal::FLOD_ONE) ? 1 : 0;
    uint16_t foldTwo = (tailFolds == ReduceOpInternal::FLOD_TWO) ? 1 : 0;

    // Process vlSize axisA each time
    uint32_t postUpdateStrideTail = dimA % vlSize;

    ReduceRAReuseSourceUnAlignedB64VF<T, Trait, Binaryfunc>(dstAddr, srcAddr, dimA,
        dimR, mainR, folds, avgFolds, foldZero, foldOne, foldTwo, postUpdateStrideTail);
}

template <class T, const MicroAPI::RegTrait &Trait, auto Binaryfunc>
__simd_vf__ inline void ReduceRAReuseSourceUnAlignedVF(__ubuf__ T *dstAddr, __ubuf__ T *srcAddr, uint32_t dimA,
    uint32_t dimR, uint32_t mainR, uint16_t folds, uint16_t avgFolds, uint16_t foldZero, uint16_t foldOne,
    uint16_t foldTwo, uint16_t foldThree, uint32_t postUpdateStrideTail)
{
    constexpr uint16_t vlSize = SupportBytes<T, 8>() ? GetVecLen() / sizeof(float) : GetVecLen() / sizeof(T);
    uint32_t tailR = dimR - mainR;
    uint16_t needInplaceAdd = tailR > 0 ? 1 : 0;
    uint16_t base = mainR;
    uint16_t mainTimes = folds / avgFolds;
    uint16_t loopANum = (dimA + vlSize - 1) / vlSize;
    uint32_t dtypeSize = sizeof(T);
    uint32_t postUpdateStrideMain = vlSize;
    uint32_t tailA = postUpdateStrideTail;
    
    MicroAPI::RegTensor<T, Trait> vregMain;
    MicroAPI::RegTensor<T, Trait> vregTail;
    MicroAPI::UnalignReg vUregMain;
    MicroAPI::UnalignReg vUregTail;
    MicroAPI::UnalignReg uDst;
    MicroAPI::MaskReg maskMain = MicroAPI::CreateMask<T, MicroAPI::MaskPattern::ALL, Trait>();
    MicroAPI::MaskReg maskTail = MicroAPI::UpdateMask<T, Trait>(tailA);

    for (uint16_t i = 0; i < needInplaceAdd; i++) {
        for (uint16_t loopR = 0; loopR < static_cast<uint16_t>(tailR); loopR++) {
            uint64_t hoistMainAddr = (uint64_t)srcAddr + static_cast<uint64_t>(loopR * dimA * dtypeSize);
            uint64_t hoistTailAddr = hoistMainAddr + static_cast<uint64_t>(mainR * dimA * dtypeSize);
            uint64_t hoistMainReuseAddr = hoistMainAddr;
            MicroAPI::DataCopyUnAlignPre(vUregMain, ((__ubuf__ T *&)hoistMainAddr));
            MicroAPI::DataCopyUnAlignPre(vUregTail, ((__ubuf__ T *&)hoistTailAddr));
            for (uint16_t loopA = 0; loopA < static_cast<uint16_t>(loopANum - 1); loopA++) {
                MicroAPI::DataCopyUnAlign(vregMain, vUregMain, ((__ubuf__ T *&)hoistMainAddr),
                    postUpdateStrideMain);
                MicroAPI::DataCopyUnAlign(vregTail, vUregTail, ((__ubuf__ T *&)hoistTailAddr),
                    postUpdateStrideMain);
                Binaryfunc(vregMain, vregMain, vregTail, maskMain);
                MicroAPI::DataCopyUnAlign(((__ubuf__ T *&)hoistMainReuseAddr), vregMain, uDst,
                    postUpdateStrideMain);
            }
            MicroAPI::DataCopyUnAlign(vregMain, vUregMain, ((__ubuf__ T *&)hoistMainAddr), postUpdateStrideTail);
            MicroAPI::DataCopyUnAlign(vregTail, vUregTail, ((__ubuf__ T *&)hoistTailAddr), postUpdateStrideTail);
            Binaryfunc(vregMain, vregMain, vregTail, maskTail);
            MicroAPI::DataCopyUnAlign(((__ubuf__ T *&)hoistMainReuseAddr), vregMain, uDst, postUpdateStrideTail);
            MicroAPI::DataCopyUnAlignPost(((__ubuf__ T *&)hoistMainReuseAddr), uDst, 0);
        }
        MicroAPI::LocalMemBar<MicroAPI::MemType::VEC_STORE, MicroAPI::MemType::VEC_LOAD>();
    }

    // MainFolds need 16 register
    MicroAPI::RegTensor<T, Trait> vreg0;
    MicroAPI::RegTensor<T, Trait> vreg1;
    MicroAPI::RegTensor<T, Trait> vreg2;
    MicroAPI::RegTensor<T, Trait> vreg3;
    MicroAPI::RegTensor<T, Trait> vreg4;
    MicroAPI::RegTensor<T, Trait> vreg5;
    MicroAPI::RegTensor<T, Trait> vreg6;
    MicroAPI::RegTensor<T, Trait> vreg7;
    MicroAPI::RegTensor<T, Trait> vreg8;
    MicroAPI::RegTensor<T, Trait> vreg9;
    MicroAPI::RegTensor<T, Trait> vreg10;
    MicroAPI::RegTensor<T, Trait> vreg11;
    MicroAPI::RegTensor<T, Trait> vreg12;
    MicroAPI::RegTensor<T, Trait> vreg13;
    MicroAPI::RegTensor<T, Trait> vreg14;
    MicroAPI::RegTensor<T, Trait> vreg15;
    MicroAPI::UnalignReg uSrc;

    // Process main folds
    uint16_t loopRNum = base;
    uint64_t src = 0;
    uint64_t dst = 0;
    uint64_t newSrc = 0;
    uint64_t newSrcTail = 0;
    for (uint16_t loopMain = 0; loopMain < mainTimes; loopMain++) {
        loopRNum = loopRNum >> avgFolds;
        uint16_t offsetR = loopRNum * dimA;
        for (uint16_t loopR = 0; loopR < loopRNum; loopR++) {
            src = (uint64_t)srcAddr + static_cast<uint64_t>(loopR * dimA * dtypeSize);
            newSrcTail = src + static_cast<uint64_t>((loopANum - 1) * vlSize * dtypeSize);
            for (uint16_t loopA = 0; loopA < static_cast<uint16_t>(loopANum - 1); loopA++) {
                newSrc = src + static_cast<uint64_t>(loopA * vlSize * dtypeSize);
                dst = newSrc;
                // L0
                MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
                MicroAPI::DataCopyUnAlign(vreg0, uSrc, ((__ubuf__ T *&)newSrc), offsetR);
                MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
                MicroAPI::DataCopyUnAlign(vreg1, uSrc, ((__ubuf__ T *&)newSrc), offsetR);
                MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
                MicroAPI::DataCopyUnAlign(vreg2, uSrc, ((__ubuf__ T *&)newSrc), offsetR);
                MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
                MicroAPI::DataCopyUnAlign(vreg3, uSrc, ((__ubuf__ T *&)newSrc), offsetR);
                MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
                MicroAPI::DataCopyUnAlign(vreg4, uSrc, ((__ubuf__ T *&)newSrc), offsetR);
                MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
                MicroAPI::DataCopyUnAlign(vreg5, uSrc, ((__ubuf__ T *&)newSrc), offsetR);
                MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
                MicroAPI::DataCopyUnAlign(vreg6, uSrc, ((__ubuf__ T *&)newSrc), offsetR);
                MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
                MicroAPI::DataCopyUnAlign(vreg7, uSrc, ((__ubuf__ T *&)newSrc), offsetR);
                MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
                MicroAPI::DataCopyUnAlign(vreg8, uSrc, ((__ubuf__ T *&)newSrc), offsetR);
                MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
                MicroAPI::DataCopyUnAlign(vreg9, uSrc, ((__ubuf__ T *&)newSrc), offsetR);
                MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
                MicroAPI::DataCopyUnAlign(vreg10, uSrc, ((__ubuf__ T *&)newSrc), offsetR);
                MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
                MicroAPI::DataCopyUnAlign(vreg11, uSrc, ((__ubuf__ T *&)newSrc), offsetR);
                MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
                MicroAPI::DataCopyUnAlign(vreg12, uSrc, ((__ubuf__ T *&)newSrc), offsetR);
                MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
                MicroAPI::DataCopyUnAlign(vreg13, uSrc, ((__ubuf__ T *&)newSrc), offsetR);
                MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
                MicroAPI::DataCopyUnAlign(vreg14, uSrc, ((__ubuf__ T *&)newSrc), offsetR);
                MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
                MicroAPI::DataCopyUnAlign(vreg15, uSrc, ((__ubuf__ T *&)newSrc), offsetR);
                // L1
                Binaryfunc(vreg0, vreg0, vreg8, maskMain);
                Binaryfunc(vreg1, vreg1, vreg9, maskMain);
                Binaryfunc(vreg2, vreg2, vreg10, maskMain);
                Binaryfunc(vreg3, vreg3, vreg11, maskMain);
                Binaryfunc(vreg4, vreg4, vreg12, maskMain);
                Binaryfunc(vreg5, vreg5, vreg13, maskMain);
                Binaryfunc(vreg6, vreg6, vreg14, maskMain);
                Binaryfunc(vreg7, vreg7, vreg15, maskMain);
                // L2
                Binaryfunc(vreg0, vreg0, vreg4, maskMain);
                Binaryfunc(vreg1, vreg1, vreg5, maskMain);
                Binaryfunc(vreg2, vreg2, vreg6, maskMain);
                Binaryfunc(vreg3, vreg3, vreg7, maskMain);
                // L3
                Binaryfunc(vreg0, vreg0, vreg2, maskMain);
                Binaryfunc(vreg1, vreg1, vreg3, maskMain);
                // L4
                Binaryfunc(vreg0, vreg0, vreg1, maskMain);
                DataCopyUnAlign((__ubuf__ T *&)dst, vreg0, uDst, postUpdateStrideMain);
            }
            dst = newSrcTail;
            // L0
            MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrcTail));
            MicroAPI::DataCopyUnAlign(vreg0, uSrc, ((__ubuf__ T *&)newSrcTail), offsetR);
            MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrcTail));
            MicroAPI::DataCopyUnAlign(vreg1, uSrc, ((__ubuf__ T *&)newSrcTail), offsetR);
            MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrcTail));
            MicroAPI::DataCopyUnAlign(vreg2, uSrc, ((__ubuf__ T *&)newSrcTail), offsetR);
            MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrcTail));
            MicroAPI::DataCopyUnAlign(vreg3, uSrc, ((__ubuf__ T *&)newSrcTail), offsetR);
            MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrcTail));
            MicroAPI::DataCopyUnAlign(vreg4, uSrc, ((__ubuf__ T *&)newSrcTail), offsetR);
            MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrcTail));
            MicroAPI::DataCopyUnAlign(vreg5, uSrc, ((__ubuf__ T *&)newSrcTail), offsetR);
            MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrcTail));
            MicroAPI::DataCopyUnAlign(vreg6, uSrc, ((__ubuf__ T *&)newSrcTail), offsetR);
            MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrcTail));
            MicroAPI::DataCopyUnAlign(vreg7, uSrc, ((__ubuf__ T *&)newSrcTail), offsetR);
            MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrcTail));
            MicroAPI::DataCopyUnAlign(vreg8, uSrc, ((__ubuf__ T *&)newSrcTail), offsetR);
            MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrcTail));
            MicroAPI::DataCopyUnAlign(vreg9, uSrc, ((__ubuf__ T *&)newSrcTail), offsetR);
            MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrcTail));
            MicroAPI::DataCopyUnAlign(vreg10, uSrc, ((__ubuf__ T *&)newSrcTail), offsetR);
            MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrcTail));
            MicroAPI::DataCopyUnAlign(vreg11, uSrc, ((__ubuf__ T *&)newSrcTail), offsetR);
            MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrcTail));
            MicroAPI::DataCopyUnAlign(vreg12, uSrc, ((__ubuf__ T *&)newSrcTail), offsetR);
            MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrcTail));
            MicroAPI::DataCopyUnAlign(vreg13, uSrc, ((__ubuf__ T *&)newSrcTail), offsetR);
            MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrcTail));
            MicroAPI::DataCopyUnAlign(vreg14, uSrc, ((__ubuf__ T *&)newSrcTail), offsetR);
            MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrcTail));
            MicroAPI::DataCopyUnAlign(vreg15, uSrc, ((__ubuf__ T *&)newSrcTail), offsetR);
            // L1
            Binaryfunc(vreg0, vreg0, vreg8, maskTail);
            Binaryfunc(vreg1, vreg1, vreg9, maskTail);
            Binaryfunc(vreg2, vreg2, vreg10, maskTail);
            Binaryfunc(vreg3, vreg3, vreg11, maskTail);
            Binaryfunc(vreg4, vreg4, vreg12, maskTail);
            Binaryfunc(vreg5, vreg5, vreg13, maskTail);
            Binaryfunc(vreg6, vreg6, vreg14, maskTail);
            Binaryfunc(vreg7, vreg7, vreg15, maskTail);
            // L2
            Binaryfunc(vreg0, vreg0, vreg4, maskTail);
            Binaryfunc(vreg1, vreg1, vreg5, maskTail);
            Binaryfunc(vreg2, vreg2, vreg6, maskTail);
            Binaryfunc(vreg3, vreg3, vreg7, maskTail);
            // L3
            Binaryfunc(vreg0, vreg0, vreg2, maskTail);
            Binaryfunc(vreg1, vreg1, vreg3, maskTail);
            // L4
            Binaryfunc(vreg0, vreg0, vreg1, maskTail);
            DataCopyUnAlign((__ubuf__ T *&)dst, vreg0, uDst, postUpdateStrideTail);
            MicroAPI::DataCopyUnAlignPost((__ubuf__ T *&)dst, uDst, 0);
        }
        MicroAPI::LocalMemBar<MicroAPI::MemType::VEC_STORE, MicroAPI::MemType::VEC_LOAD>();
    }

    // Process tail folds
    src = (uint64_t)srcAddr;
    for (uint16_t i = 0; i < foldOne; i++) {
        ReduceRAReuseSourceUnAlignedFoldOne<T, Trait, Binaryfunc>(dstAddr, src, loopANum, dimA, vlSize, dtypeSize, maskMain,
            maskTail, postUpdateStrideMain, postUpdateStrideTail);
    }

    for (uint16_t i = 0; i < foldTwo; i++) {
        ReduceRAReuseSourceUnAlignedFoldTwo<T, Trait, Binaryfunc>(dstAddr, src, loopANum, dimA, vlSize, dtypeSize, maskMain,
            maskTail, postUpdateStrideMain, postUpdateStrideTail);
    }

    for (uint16_t i = 0; i < foldThree; i++) {
        for (uint16_t loopA = 0; loopA < static_cast<uint16_t>(loopANum - 1); loopA++) {
            newSrc = src + static_cast<uint64_t>(loopA * vlSize * dtypeSize);
            // L0
            MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
            MicroAPI::DataCopyUnAlign(vreg0, uSrc, ((__ubuf__ T *&)newSrc), dimA);
            MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
            MicroAPI::DataCopyUnAlign(vreg1, uSrc, ((__ubuf__ T *&)newSrc), dimA);
            MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
            MicroAPI::DataCopyUnAlign(vreg2, uSrc, ((__ubuf__ T *&)newSrc), dimA);
            MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
            MicroAPI::DataCopyUnAlign(vreg3, uSrc, ((__ubuf__ T *&)newSrc), dimA);
            MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
            MicroAPI::DataCopyUnAlign(vreg4, uSrc, ((__ubuf__ T *&)newSrc), dimA);
            MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
            MicroAPI::DataCopyUnAlign(vreg5, uSrc, ((__ubuf__ T *&)newSrc), dimA);
            MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
            MicroAPI::DataCopyUnAlign(vreg6, uSrc, ((__ubuf__ T *&)newSrc), dimA);
            MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
            MicroAPI::DataCopyUnAlign(vreg7, uSrc, ((__ubuf__ T *&)newSrc), dimA);
            // L1
            Binaryfunc(vreg0, vreg0, vreg4, maskMain);
            Binaryfunc(vreg1, vreg1, vreg5, maskMain);
            Binaryfunc(vreg2, vreg2, vreg6, maskMain);
            Binaryfunc(vreg3, vreg3, vreg7, maskMain);
            // L2
            Binaryfunc(vreg0, vreg0, vreg2, maskMain);
            Binaryfunc(vreg1, vreg1, vreg3, maskMain);
            // L3
            Binaryfunc(vreg0, vreg0, vreg1, maskMain);
            DataCopyUnAlign((__ubuf__ T *&)dstAddr, vreg0, uDst, postUpdateStrideMain);
        }
        newSrc = src + static_cast<uint64_t>((loopANum - 1) * vlSize * dtypeSize);
        // L0
        MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
        MicroAPI::DataCopyUnAlign(vreg0, uSrc, ((__ubuf__ T *&)newSrc), dimA);
        MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
        MicroAPI::DataCopyUnAlign(vreg1, uSrc, ((__ubuf__ T *&)newSrc), dimA);
        MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
        MicroAPI::DataCopyUnAlign(vreg2, uSrc, ((__ubuf__ T *&)newSrc), dimA);
        MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
        MicroAPI::DataCopyUnAlign(vreg3, uSrc, ((__ubuf__ T *&)newSrc), dimA);
        MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
        MicroAPI::DataCopyUnAlign(vreg4, uSrc, ((__ubuf__ T *&)newSrc), dimA);
        MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
        MicroAPI::DataCopyUnAlign(vreg5, uSrc, ((__ubuf__ T *&)newSrc), dimA);
        MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
        MicroAPI::DataCopyUnAlign(vreg6, uSrc, ((__ubuf__ T *&)newSrc), dimA);
        MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
        MicroAPI::DataCopyUnAlign(vreg7, uSrc, ((__ubuf__ T *&)newSrc), dimA);
        // L1
        Binaryfunc(vreg0, vreg0, vreg4, maskTail);
        Binaryfunc(vreg1, vreg1, vreg5, maskTail);
        Binaryfunc(vreg2, vreg2, vreg6, maskTail);
        Binaryfunc(vreg3, vreg3, vreg7, maskTail);
        // L2
        Binaryfunc(vreg0, vreg0, vreg2, maskTail);
        Binaryfunc(vreg1, vreg1, vreg3, maskTail);
        // L3
        Binaryfunc(vreg0, vreg0, vreg1, maskTail);
        DataCopyUnAlign((__ubuf__ T *&)dstAddr, vreg0, uDst, postUpdateStrideTail);
        MicroAPI::DataCopyUnAlignPost((__ubuf__ T *&)dstAddr, uDst, 0);
    }

    // Reduce to 1
    for (uint16_t i = 0; i < foldZero; ++i) {
        for (uint16_t loopA = 0; loopA < static_cast<uint16_t>(loopANum - 1); loopA++) {
            MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)srcAddr));
            MicroAPI::DataCopyUnAlign(vreg0, uSrc, ((__ubuf__ T *&)srcAddr), postUpdateStrideMain);
            MicroAPI::DataCopyUnAlign((__ubuf__ T *&)dstAddr, vreg0, uDst, postUpdateStrideMain);
        }
        MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)srcAddr));
        MicroAPI::DataCopyUnAlign(vreg0, uSrc, ((__ubuf__ T *&)srcAddr), postUpdateStrideTail);
        MicroAPI::DataCopyUnAlign((__ubuf__ T *&)dstAddr, vreg0, uDst, postUpdateStrideTail);
        MicroAPI::DataCopyUnAlignPost((__ubuf__ T *&)dstAddr, uDst, 0);
    }
}

template <class T, const MicroAPI::RegTrait &Trait, auto Binaryfunc>
__aicore__ inline void ReduceRAReuseSourceUnAligned(__ubuf__ T *dstAddr, __ubuf__ T *srcAddr, uint32_t dimA,
    uint32_t dimR)
{
    constexpr uint16_t vlSize = SupportBytes<T, 8>() ? GetVecLen() / sizeof(float) : GetVecLen() / sizeof(T);
    uint32_t mainR = ReduceOpInternal::CalculateMainR(dimR, false, vlSize);
    uint16_t folds = ReduceOpInternal::CalculateFolds(mainR);
    uint16_t avgFolds = ReduceOpInternal::BASE_FOLD;
    uint16_t tailFolds = folds % avgFolds;
    uint16_t foldZero = (tailFolds == 0) ? 1 : 0;
    uint16_t foldOne = (tailFolds == ReduceOpInternal::FLOD_ONE) ? 1 : 0;
    uint16_t foldTwo = (tailFolds == ReduceOpInternal::FLOD_TWO) ? 1 : 0;
    uint16_t foldThree = (tailFolds == ReduceOpInternal::FLOD_THREE) ? 1 : 0;
    // Process vlSize axisA each time
    uint32_t postUpdateStrideTail = dimA % vlSize;
    ReduceRAReuseSourceUnAlignedVF<T, Trait, Binaryfunc>(dstAddr, srcAddr, dimA,
        dimR, mainR, folds, avgFolds, foldZero, foldOne, foldTwo, foldThree, postUpdateStrideTail);
}

template <class T, class U, const MicroAPI::RegTrait &Trait, const MicroAPI::CastTrait &CastTraitUppper,
    const MicroAPI::CastTrait &CastTraitLower, const uint16_t vlSize, auto Binaryfunc, auto Reducefunc>
__simd_callee__ inline void ReduceARCastfoldZeroUnAligned(__ubuf__ T *dstAddr, __ubuf__ T *srcAddr, uint32_t dimA,
    uint32_t dimR, MicroAPI::MaskReg &fullMask)
{
    MicroAPI::RegTensor<U, Trait> vreg0Cast;
    MicroAPI::RegTensor<U, Trait> vreg1Cast;
    MicroAPI::RegTensor<T, Trait> vreg0;
    MicroAPI::RegTensor<T, Trait> vreg1;
    MicroAPI::UnalignReg uSrc0;
    MicroAPI::UnalignReg uSrc1;
    MicroAPI::UnalignReg uDst;
    for (uint16_t loopA = 0; loopA < static_cast<uint16_t>(dimA); loopA++) {
        uint64_t newSrc = (uint64_t)srcAddr + vlSize / 2 * sizeof(T);
        MicroAPI::DataCopyUnAlignPre(uSrc0, ((__ubuf__ T *&)srcAddr));
        MicroAPI::DataCopyUnAlignPre(uSrc1, ((__ubuf__ T *&)newSrc));
        MicroAPI::DataCopyUnAlign(vreg0, uSrc0, ((__ubuf__ T *&)srcAddr), dimR);
        MicroAPI::DataCopyUnAlign(vreg1, uSrc1, ((__ubuf__ T *&)newSrc), dimR);
        Binaryfunc(vreg0, vreg0, vreg1, fullMask);
        if constexpr (IsSameType<T, bfloat16_t>::value) {
            MicroAPI::UnPack((MicroAPI::RegTensor<uint32_t, Trait> &)vreg0,
                (MicroAPI::RegTensor<uint16_t, Trait> &)vreg0);
        } else {
            MicroAPI::UnPack((MicroAPI::RegTensor<uint16_t, Trait> &)vreg0,
                (MicroAPI::RegTensor<uint8_t, Trait> &)vreg0);
        }
        if constexpr (IsSameType<T, bfloat16_t>::value) {
            MicroAPI::Cast<U, T, ReduceOpInternal::CastTraitBF16F32>(vreg0Cast, vreg0, fullMask);
            Reducefunc(vreg1Cast, vreg0Cast, fullMask);
            MicroAPI::Cast<T, U, ReduceOpInternal::CastTraitF32BF16>(vreg1, vreg1Cast, fullMask);
        } else {
            MicroAPI::Cast<U, T, ReduceOpInternal::CastTraitB8F16>(vreg0Cast, vreg0, fullMask);
            Reducefunc(vreg1Cast, vreg0Cast, fullMask);
            MicroAPI::Cast<T, U, ReduceOpInternal::CastTraitF16B8>(vreg1, vreg1Cast, fullMask);
        }
        DataCopyUnAlign((__ubuf__ T *&)dstAddr, vreg1, uDst, 1);
    }
    MicroAPI::DataCopyUnAlignPost((__ubuf__ T *&)dstAddr, uDst, 0);
}

template <class T, const MicroAPI::RegTrait &Trait, auto Binaryfunc, auto Reducefunc>
__simd_vf__ inline void ReduceAROverVLReuseSourceUnAlignedB64VF(__ubuf__ T *dstAddr, __ubuf__ T *srcAddr, uint32_t dimA,
    uint32_t dimR, uint32_t mainR, uint32_t tailR, uint16_t base, uint16_t folds, uint16_t avgFolds, uint16_t foldZero, uint16_t foldOne,
    uint16_t foldTwo, uint32_t postUpdateStrideTail)
{
    constexpr uint16_t vlSize = SupportBytes<T, 8>() ? GetVecLen() / sizeof(float) : GetVecLen() / sizeof(T);
    uint16_t needInplaceAdd = tailR > 0 ? 1 : 0;
    uint16_t inplaceRepeats = (tailR + vlSize - 1) / vlSize;
    uint32_t dtypeSize = sizeof(T);
    uint16_t mainTimes = folds / avgFolds;
    uint32_t postUpdateStrideMain = vlSize;
    uint32_t tailA = postUpdateStrideTail;
    
    MicroAPI::RegTensor<T, Trait> vregMain;
    MicroAPI::RegTensor<T, Trait> vregTail;
    MicroAPI::UnalignReg vUregMain;
    MicroAPI::UnalignReg vUregTail;
    MicroAPI::UnalignReg uDst;
    MicroAPI::MaskReg fullMask = MicroAPI::CreateMask<T, MicroAPI::MaskPattern::ALL, Trait>();
    MicroAPI::MaskReg maskTail = MicroAPI::UpdateMask<T, Trait>(tailA);
    for (uint16_t i = 0; i < needInplaceAdd; i++) {
        for (uint16_t loopA = 0; loopA < static_cast<uint16_t>(dimA); loopA++) {
            uint64_t hoistMainAddr = (uint64_t)srcAddr + static_cast<uint64_t>(loopA * dimR * dtypeSize);
            uint64_t hoistTailAddr = hoistMainAddr + static_cast<uint64_t>(mainR * dtypeSize);
            uint64_t hoistMainReuseAddr = hoistMainAddr;
            MicroAPI::DataCopyUnAlignPre(vUregMain, ((__ubuf__ T *&)hoistMainAddr));
            MicroAPI::DataCopyUnAlignPre(vUregTail, ((__ubuf__ T *&)hoistTailAddr));
            for (uint16_t loopR = 0; loopR < static_cast<uint16_t>(inplaceRepeats - 1); loopR++) {
                MicroAPI::DataCopyUnAlign(vregMain, vUregMain, ((__ubuf__ T *&)hoistMainAddr), vlSize);
                MicroAPI::DataCopyUnAlign(vregTail, vUregTail, ((__ubuf__ T *&)hoistTailAddr), vlSize);
                Binaryfunc(vregMain, vregMain, vregTail, fullMask);
                MicroAPI::DataCopyUnAlign(((__ubuf__ T *&)hoistMainReuseAddr), vregMain, uDst,
                    postUpdateStrideMain);
                MicroAPI::DataCopyUnAlignPost(((__ubuf__ T *&)hoistMainReuseAddr), uDst, 0);
            }
            MicroAPI::DataCopyUnAlign(vregMain, vUregMain, ((__ubuf__ T *&)hoistMainAddr), vlSize);
            MicroAPI::DataCopyUnAlign(vregTail, vUregTail, ((__ubuf__ T *&)hoistTailAddr), vlSize);
            Binaryfunc(vregMain, vregMain, vregTail, maskTail);
            MicroAPI::DataCopyUnAlign(((__ubuf__ T *&)hoistMainReuseAddr), vregMain, uDst, postUpdateStrideTail);
            MicroAPI::DataCopyUnAlignPost(((__ubuf__ T *&)hoistMainReuseAddr), uDst, 0);
        }
        MicroAPI::LocalMemBar<MicroAPI::MemType::VEC_STORE, MicroAPI::MemType::VEC_LOAD>();
    }

    // MainFolds need 16 register
    MicroAPI::RegTensor<T, Trait> vreg0;
    MicroAPI::RegTensor<T, Trait> vreg1;
    MicroAPI::RegTensor<T, Trait> vreg2;
    MicroAPI::RegTensor<T, Trait> vreg3;
    MicroAPI::RegTensor<T, Trait> vreg4;
    MicroAPI::RegTensor<T, Trait> vreg5;
    MicroAPI::RegTensor<T, Trait> vreg6;
    MicroAPI::RegTensor<T, Trait> vreg7;
    MicroAPI::UnalignReg uSrc;

    // Process main folds
    uint16_t loopRNum = base;
    uint64_t src = 0;
    uint64_t dst = 0;
    uint64_t newSrc = 0;
    for (uint16_t loopMain = 0; loopMain < mainTimes; loopMain++) {
        loopRNum = loopRNum >> avgFolds;
        uint16_t offsetR = loopRNum * vlSize;

        for (uint16_t loopA = 0; loopA < static_cast<uint16_t>(dimA); loopA++) {
            src = (uint64_t)srcAddr + static_cast<uint64_t>(loopA * dimR * dtypeSize);
            dst = src;
            for (uint16_t loopR = 0; loopR < loopRNum; loopR++) {
                newSrc = src + static_cast<uint64_t>(loopR * vlSize * dtypeSize);
                // L0
                MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
                MicroAPI::DataCopyUnAlign(vreg0, uSrc, ((__ubuf__ T *&)newSrc), offsetR);
                MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
                MicroAPI::DataCopyUnAlign(vreg1, uSrc, ((__ubuf__ T *&)newSrc), offsetR);
                MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
                MicroAPI::DataCopyUnAlign(vreg2, uSrc, ((__ubuf__ T *&)newSrc), offsetR);
                MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
                MicroAPI::DataCopyUnAlign(vreg3, uSrc, ((__ubuf__ T *&)newSrc), offsetR);
                MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
                MicroAPI::DataCopyUnAlign(vreg4, uSrc, ((__ubuf__ T *&)newSrc), offsetR);
                MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
                MicroAPI::DataCopyUnAlign(vreg5, uSrc, ((__ubuf__ T *&)newSrc), offsetR);
                MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
                MicroAPI::DataCopyUnAlign(vreg6, uSrc, ((__ubuf__ T *&)newSrc), offsetR);
                MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
                MicroAPI::DataCopyUnAlign(vreg7, uSrc, ((__ubuf__ T *&)newSrc), offsetR);
                // L1
                Binaryfunc(vreg0, vreg0, vreg4, fullMask);
                Binaryfunc(vreg1, vreg1, vreg5, fullMask);
                Binaryfunc(vreg2, vreg2, vreg6, fullMask);
                Binaryfunc(vreg3, vreg3, vreg7, fullMask);
                // L2
                Binaryfunc(vreg0, vreg0, vreg2, fullMask);
                Binaryfunc(vreg1, vreg1, vreg3, fullMask);
                // L3
                Binaryfunc(vreg0, vreg0, vreg1, fullMask);
                DataCopyUnAlign((__ubuf__ T *&)dst, vreg0, uDst, vlSize);
            }
            MicroAPI::DataCopyUnAlignPost((__ubuf__ T *&)dst, uDst, 0);
        }
        MicroAPI::LocalMemBar<MicroAPI::MemType::VEC_STORE, MicroAPI::MemType::VEC_LOAD>();
    }

    // Process tail folds
    src = (uint64_t)srcAddr;
    for (uint16_t i = 0; i < foldOne; i++) {
        for (uint16_t loopA = 0; loopA < static_cast<uint16_t>(dimA); loopA++) {
            newSrc = src + static_cast<uint64_t>(loopA * dimR * dtypeSize);
            // L0
            MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
            MicroAPI::DataCopyUnAlign(vreg0, uSrc, ((__ubuf__ T *&)newSrc), vlSize);
            MicroAPI::DataCopyUnAlign(vreg1, uSrc, ((__ubuf__ T *&)newSrc), vlSize);
            // L1
            Binaryfunc(vreg0, vreg0, vreg1, fullMask);
            if constexpr (IsSameType<T, bfloat16_t>::value) {
                ReduceOpInternal::ReduceARCastfoldOneToThree<T, float, Trait, ReduceOpInternal::CastTraitBF16F32,
                    ReduceOpInternal::CastTraitF32BF16, Binaryfunc, Reducefunc>(vreg0, vreg2, fullMask);
            } else if constexpr (SupportBytes<T, 1>()) {
                ReduceOpInternal::ReduceARCastfoldOneToThree<T, half, Trait, ReduceOpInternal::CastTraitB8F16,
                    ReduceOpInternal::CastTraitF16B8, Binaryfunc, Reducefunc>(vreg0, vreg2, fullMask);
            } else {
                Reducefunc(vreg2, vreg0, fullMask);
            }
            DataCopyUnAlign((__ubuf__ T *&)dstAddr, vreg2, uDst, 1);
        }
        MicroAPI::DataCopyUnAlignPost((__ubuf__ T *&)dstAddr, uDst, 0);
    }

    for (uint16_t i = 0; i < foldTwo; i++) {
        for (uint16_t loopA = 0; loopA < static_cast<uint16_t>(dimA); loopA++) {
            newSrc = src + static_cast<uint64_t>(loopA * dimR * dtypeSize);
            // L0
            MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
            MicroAPI::DataCopyUnAlign(vreg0, uSrc, ((__ubuf__ T *&)newSrc), vlSize);
            MicroAPI::DataCopyUnAlign(vreg1, uSrc, ((__ubuf__ T *&)newSrc), vlSize);
            MicroAPI::DataCopyUnAlign(vreg2, uSrc, ((__ubuf__ T *&)newSrc), vlSize);
            MicroAPI::DataCopyUnAlign(vreg3, uSrc, ((__ubuf__ T *&)newSrc), vlSize);
            // L1
            Binaryfunc(vreg0, vreg0, vreg2, fullMask);
            Binaryfunc(vreg1, vreg1, vreg3, fullMask);
            // L2
            Binaryfunc(vreg0, vreg0, vreg1, fullMask);
            if constexpr (IsSameType<T, bfloat16_t>::value) {
                ReduceOpInternal::ReduceARCastfoldOneToThree<T, float, Trait, ReduceOpInternal::CastTraitBF16F32,
                    ReduceOpInternal::CastTraitF32BF16, Binaryfunc, Reducefunc>(vreg0, vreg2, fullMask);
            } else if constexpr (SupportBytes<T, 1>()) {
                ReduceOpInternal::ReduceARCastfoldOneToThree<T, half, Trait, ReduceOpInternal::CastTraitB8F16,
                    ReduceOpInternal::CastTraitF16B8, Binaryfunc, Reducefunc>(vreg0, vreg2, fullMask);
            } else {
                Reducefunc(vreg2, vreg0, fullMask);
            }
            DataCopyUnAlign((__ubuf__ T *&)dstAddr, vreg2, uDst, 1);
        }
        MicroAPI::DataCopyUnAlignPost((__ubuf__ T *&)dstAddr, uDst, 0);
    }

    // Reduce to 1
    uint32_t sreg1 = mainR;
    for (uint16_t i = 0; i < foldZero; i++) {
        if constexpr (IsSameType<T, bfloat16_t>::value) {
            ReduceARCastfoldZeroUnAligned<T, float, Trait, ReduceOpInternal::CastTraitBF16F32,
                ReduceOpInternal::CastTraitF32BF16, vlSize, Binaryfunc, Reducefunc>(dstAddr, srcAddr, dimA, dimR,
                fullMask);
        } else if constexpr (SupportBytes<T, 1>()) {
            ReduceARCastfoldZeroUnAligned<T, half, Trait, ReduceOpInternal::CastTraitB8F16,
                ReduceOpInternal::CastTraitF16B8, vlSize, Binaryfunc, Reducefunc>(dstAddr, srcAddr, dimA, dimR,
                fullMask);
        } else {
            MicroAPI::MaskReg mask = MicroAPI::UpdateMask<T, Trait>(sreg1);
            for (uint16_t loopA = 0; loopA < static_cast<uint16_t>(dimA); loopA++) {
                newSrc = src + static_cast<uint64_t>(loopA * dimR * dtypeSize);
                MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
                MicroAPI::DataCopyUnAlign(vreg0, uSrc, ((__ubuf__ T *&)newSrc), vlSize);
                Reducefunc(vreg1, vreg0, mask);
                MicroAPI::DataCopyUnAlign((__ubuf__ T *&)dstAddr, vreg1, uDst, 1);
            }
            MicroAPI::DataCopyUnAlignPost((__ubuf__ T *&)dstAddr, uDst, 0);
        }
    }
}

template <class T, const MicroAPI::RegTrait &Trait, auto Binaryfunc, auto Reducefunc>
__aicore__ inline void ReduceAROverVLReuseSourceUnAlignedB64(__ubuf__ T *dstAddr, __ubuf__ T *srcAddr, uint32_t dimA,
    uint32_t dimR)
{
    constexpr uint16_t vlSize = SupportBytes<T, 8>() ? GetVecLen() / sizeof(float) : GetVecLen() / sizeof(T);
    uint32_t mainR = ReduceOpInternal::CalculateMainR(dimR, true, vlSize);
    uint32_t tailR = dimR - mainR;
    uint16_t base = mainR / vlSize;
    uint16_t folds = ReduceOpInternal::CalculateFolds(base);
    uint16_t avgFolds = ReduceOpInternal::BASE_FOLD_B64;
    uint16_t tailFolds = folds % avgFolds;
    uint16_t foldZero = (tailFolds == 0) ? 1 : 0;
    uint16_t foldOne = (tailFolds == ReduceOpInternal::FLOD_ONE) ? 1 : 0;
    uint16_t foldTwo = (tailFolds == ReduceOpInternal::FLOD_TWO) ? 1 : 0;
    uint32_t postUpdateStrideTail = tailR % vlSize;
    ReduceAROverVLReuseSourceUnAlignedB64VF<T, Trait, Binaryfunc, Reducefunc>(dstAddr, srcAddr, dimA,
        dimR, mainR, tailR, base, folds, avgFolds, foldZero, foldOne, foldTwo, postUpdateStrideTail);
}

template <class T, const MicroAPI::RegTrait &Trait, auto Binaryfunc, auto Reducefunc>
__simd_vf__ inline void ReduceAROverVLReuseSourceUnAlignedVF(__ubuf__ T *dstAddr, __ubuf__ T *srcAddr, uint32_t dimA,
    uint32_t dimR, uint32_t mainR, uint32_t tailR, uint16_t base, uint16_t folds, uint16_t avgFolds, uint16_t foldZero,
    uint16_t foldOne, uint16_t foldTwo, uint16_t foldThree, uint32_t postUpdateStrideTail)
{
    constexpr uint16_t vlSize = SupportBytes<T, 8>() ? GetVecLen() / sizeof(float) : GetVecLen() / sizeof(T);
    uint16_t needInplaceAdd = tailR > 0 ? 1 : 0;
    uint16_t inplaceRepeats = (tailR + vlSize - 1) / vlSize;
    uint32_t dtypeSize = sizeof(T);
    uint16_t mainTimes = folds / avgFolds;
    uint32_t postUpdateStrideMain = vlSize;
    uint32_t tailA = postUpdateStrideTail;
    
    MicroAPI::RegTensor<T, Trait> vregMain;
    MicroAPI::RegTensor<T, Trait> vregTail;
    MicroAPI::UnalignReg vUregMain;
    MicroAPI::UnalignReg vUregTail;
    MicroAPI::UnalignReg uDst;
    MicroAPI::MaskReg fullMask = MicroAPI::CreateMask<T, MicroAPI::MaskPattern::ALL, Trait>();
    MicroAPI::MaskReg maskTail = MicroAPI::UpdateMask<T, Trait>(tailA);
    for (uint16_t i = 0; i < needInplaceAdd; i++) {
        for (uint16_t loopA = 0; loopA < static_cast<uint16_t>(dimA); loopA++) {
            uint64_t hoistMainAddr = (uint64_t)srcAddr + static_cast<uint64_t>(loopA * dimR * dtypeSize);
            uint64_t hoistTailAddr = hoistMainAddr + static_cast<uint64_t>(mainR * dtypeSize);
            uint64_t hoistMainReuseAddr = hoistMainAddr;
            MicroAPI::DataCopyUnAlignPre(vUregMain, ((__ubuf__ T *&)hoistMainAddr));
            MicroAPI::DataCopyUnAlignPre(vUregTail, ((__ubuf__ T *&)hoistTailAddr));
            for (uint16_t loopR = 0; loopR < static_cast<uint16_t>(inplaceRepeats - 1); loopR++) {
                MicroAPI::DataCopyUnAlign(vregMain, vUregMain, ((__ubuf__ T *&)hoistMainAddr), vlSize);
                MicroAPI::DataCopyUnAlign(vregTail, vUregTail, ((__ubuf__ T *&)hoistTailAddr), vlSize);
                Binaryfunc(vregMain, vregMain, vregTail, fullMask);
                MicroAPI::DataCopyUnAlign(((__ubuf__ T *&)hoistMainReuseAddr), vregMain, uDst,
                    postUpdateStrideMain);
                MicroAPI::DataCopyUnAlignPost(((__ubuf__ T *&)hoistMainReuseAddr), uDst, 0);
            }
            MicroAPI::DataCopyUnAlign(vregMain, vUregMain, ((__ubuf__ T *&)hoistMainAddr), vlSize);
            MicroAPI::DataCopyUnAlign(vregTail, vUregTail, ((__ubuf__ T *&)hoistTailAddr), vlSize);
            Binaryfunc(vregMain, vregMain, vregTail, maskTail);
            MicroAPI::DataCopyUnAlign(((__ubuf__ T *&)hoistMainReuseAddr), vregMain, uDst, postUpdateStrideTail);
            MicroAPI::DataCopyUnAlignPost(((__ubuf__ T *&)hoistMainReuseAddr), uDst, 0);
        }
        MicroAPI::LocalMemBar<MicroAPI::MemType::VEC_STORE, MicroAPI::MemType::VEC_LOAD>();
    }

    // MainFolds need 16 register
    MicroAPI::RegTensor<T, Trait> vreg0;
    MicroAPI::RegTensor<T, Trait> vreg1;
    MicroAPI::RegTensor<T, Trait> vreg2;
    MicroAPI::RegTensor<T, Trait> vreg3;
    MicroAPI::RegTensor<T, Trait> vreg4;
    MicroAPI::RegTensor<T, Trait> vreg5;
    MicroAPI::RegTensor<T, Trait> vreg6;
    MicroAPI::RegTensor<T, Trait> vreg7;
    MicroAPI::RegTensor<T, Trait> vreg8;
    MicroAPI::RegTensor<T, Trait> vreg9;
    MicroAPI::RegTensor<T, Trait> vreg10;
    MicroAPI::RegTensor<T, Trait> vreg11;
    MicroAPI::RegTensor<T, Trait> vreg12;
    MicroAPI::RegTensor<T, Trait> vreg13;
    MicroAPI::RegTensor<T, Trait> vreg14;
    MicroAPI::RegTensor<T, Trait> vreg15;
    MicroAPI::UnalignReg uSrc;

    // Process main folds
    uint16_t loopRNum = base;
    uint64_t src = 0;
    uint64_t dst = 0;
    uint64_t newSrc = 0;
    for (uint16_t loopMain = 0; loopMain < mainTimes; loopMain++) {
        loopRNum = loopRNum >> avgFolds;
        uint16_t offsetR = loopRNum * vlSize;

        for (uint16_t loopA = 0; loopA < static_cast<uint16_t>(dimA); loopA++) {
            src = (uint64_t)srcAddr + static_cast<uint64_t>(loopA * dimR * dtypeSize);
            dst = src;
            for (uint16_t loopR = 0; loopR < loopRNum; loopR++) {
                newSrc = src + static_cast<uint64_t>(loopR * vlSize * dtypeSize);
                // L0
                MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
                MicroAPI::DataCopyUnAlign(vreg0, uSrc, ((__ubuf__ T *&)newSrc), offsetR);
                MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
                MicroAPI::DataCopyUnAlign(vreg1, uSrc, ((__ubuf__ T *&)newSrc), offsetR);
                MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
                MicroAPI::DataCopyUnAlign(vreg2, uSrc, ((__ubuf__ T *&)newSrc), offsetR);
                MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
                MicroAPI::DataCopyUnAlign(vreg3, uSrc, ((__ubuf__ T *&)newSrc), offsetR);
                MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
                MicroAPI::DataCopyUnAlign(vreg4, uSrc, ((__ubuf__ T *&)newSrc), offsetR);
                MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
                MicroAPI::DataCopyUnAlign(vreg5, uSrc, ((__ubuf__ T *&)newSrc), offsetR);
                MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
                MicroAPI::DataCopyUnAlign(vreg6, uSrc, ((__ubuf__ T *&)newSrc), offsetR);
                MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
                MicroAPI::DataCopyUnAlign(vreg7, uSrc, ((__ubuf__ T *&)newSrc), offsetR);
                MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
                MicroAPI::DataCopyUnAlign(vreg8, uSrc, ((__ubuf__ T *&)newSrc), offsetR);
                MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
                MicroAPI::DataCopyUnAlign(vreg9, uSrc, ((__ubuf__ T *&)newSrc), offsetR);
                MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
                MicroAPI::DataCopyUnAlign(vreg10, uSrc, ((__ubuf__ T *&)newSrc), offsetR);
                MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
                MicroAPI::DataCopyUnAlign(vreg11, uSrc, ((__ubuf__ T *&)newSrc), offsetR);
                MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
                MicroAPI::DataCopyUnAlign(vreg12, uSrc, ((__ubuf__ T *&)newSrc), offsetR);
                MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
                MicroAPI::DataCopyUnAlign(vreg13, uSrc, ((__ubuf__ T *&)newSrc), offsetR);
                MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
                MicroAPI::DataCopyUnAlign(vreg14, uSrc, ((__ubuf__ T *&)newSrc), offsetR);
                MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
                MicroAPI::DataCopyUnAlign(vreg15, uSrc, ((__ubuf__ T *&)newSrc), offsetR);
                // L1
                Binaryfunc(vreg0, vreg0, vreg8, fullMask);
                Binaryfunc(vreg1, vreg1, vreg9, fullMask);
                Binaryfunc(vreg2, vreg2, vreg10, fullMask);
                Binaryfunc(vreg3, vreg3, vreg11, fullMask);
                Binaryfunc(vreg4, vreg4, vreg12, fullMask);
                Binaryfunc(vreg5, vreg5, vreg13, fullMask);
                Binaryfunc(vreg6, vreg6, vreg14, fullMask);
                Binaryfunc(vreg7, vreg7, vreg15, fullMask);
                // L2
                Binaryfunc(vreg0, vreg0, vreg4, fullMask);
                Binaryfunc(vreg1, vreg1, vreg5, fullMask);
                Binaryfunc(vreg2, vreg2, vreg6, fullMask);
                Binaryfunc(vreg3, vreg3, vreg7, fullMask);
                // L2
                Binaryfunc(vreg0, vreg0, vreg2, fullMask);
                Binaryfunc(vreg1, vreg1, vreg3, fullMask);
                // L3
                Binaryfunc(vreg0, vreg0, vreg1, fullMask);
                DataCopyUnAlign((__ubuf__ T *&)dst, vreg0, uDst, vlSize);
            }
            MicroAPI::DataCopyUnAlignPost((__ubuf__ T *&)dst, uDst, 0);
        }
        MicroAPI::LocalMemBar<MicroAPI::MemType::VEC_STORE, MicroAPI::MemType::VEC_LOAD>();
    }

    // Process tail folds
    src = (uint64_t)srcAddr;
    for (uint16_t i = 0; i < foldOne; i++) {
        for (uint16_t loopA = 0; loopA < static_cast<uint16_t>(dimA); loopA++) {
            newSrc = src + static_cast<uint64_t>(loopA * dimR * dtypeSize);
            // L0
            MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
            MicroAPI::DataCopyUnAlign(vreg0, uSrc, ((__ubuf__ T *&)newSrc), vlSize);
            MicroAPI::DataCopyUnAlign(vreg1, uSrc, ((__ubuf__ T *&)newSrc), vlSize);
            // L1
            Binaryfunc(vreg0, vreg0, vreg1, fullMask);
            if constexpr (IsSameType<T, bfloat16_t>::value) {
                ReduceOpInternal::ReduceARCastfoldOneToThree<T, float, Trait, ReduceOpInternal::CastTraitBF16F32,
                    ReduceOpInternal::CastTraitF32BF16, Binaryfunc, Reducefunc>(vreg0, vreg2, fullMask);
            } else if constexpr (SupportBytes<T, 1>()) {
                ReduceOpInternal::ReduceARCastfoldOneToThree<T, half, Trait, ReduceOpInternal::CastTraitB8F16,
                    ReduceOpInternal::CastTraitF16B8, Binaryfunc, Reducefunc>(vreg0, vreg2, fullMask);
            } else {
                Reducefunc(vreg2, vreg0, fullMask);
            }
            DataCopyUnAlign((__ubuf__ T *&)dstAddr, vreg2, uDst, 1);
        }
        MicroAPI::DataCopyUnAlignPost((__ubuf__ T *&)dstAddr, uDst, 0);
    }

    for (uint16_t i = 0; i < foldTwo; i++) {
        for (uint16_t loopA = 0; loopA < static_cast<uint16_t>(dimA); loopA++) {
            newSrc = src + static_cast<uint64_t>(loopA * dimR * dtypeSize);
            // L0
            MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
            MicroAPI::DataCopyUnAlign(vreg0, uSrc, ((__ubuf__ T *&)newSrc), vlSize);
            MicroAPI::DataCopyUnAlign(vreg1, uSrc, ((__ubuf__ T *&)newSrc), vlSize);
            MicroAPI::DataCopyUnAlign(vreg2, uSrc, ((__ubuf__ T *&)newSrc), vlSize);
            MicroAPI::DataCopyUnAlign(vreg3, uSrc, ((__ubuf__ T *&)newSrc), vlSize);
            // L1
            Binaryfunc(vreg0, vreg0, vreg2, fullMask);
            Binaryfunc(vreg1, vreg1, vreg3, fullMask);
            // L2
            Binaryfunc(vreg0, vreg0, vreg1, fullMask);
            if constexpr (IsSameType<T, bfloat16_t>::value) {
                ReduceOpInternal::ReduceARCastfoldOneToThree<T, float, Trait, ReduceOpInternal::CastTraitBF16F32,
                    ReduceOpInternal::CastTraitF32BF16, Binaryfunc, Reducefunc>(vreg0, vreg2, fullMask);
            } else if constexpr (SupportBytes<T, 1>()) {
                ReduceOpInternal::ReduceARCastfoldOneToThree<T, half, Trait, ReduceOpInternal::CastTraitB8F16,
                    ReduceOpInternal::CastTraitF16B8, Binaryfunc, Reducefunc>(vreg0, vreg2, fullMask);
            } else {
                Reducefunc(vreg2, vreg0, fullMask);
            }
            DataCopyUnAlign((__ubuf__ T *&)dstAddr, vreg2, uDst, 1);
        }
        MicroAPI::DataCopyUnAlignPost((__ubuf__ T *&)dstAddr, uDst, 0);
    }

    for (uint16_t i = 0; i < foldThree; i++) {
        for (uint16_t loopA = 0; loopA < static_cast<uint16_t>(dimA); loopA++) {
            newSrc = src + static_cast<uint64_t>(loopA * dimR * dtypeSize);
            // L0
            MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
            MicroAPI::DataCopyUnAlign(vreg0, uSrc, ((__ubuf__ T *&)newSrc), vlSize);
            MicroAPI::DataCopyUnAlign(vreg1, uSrc, ((__ubuf__ T *&)newSrc), vlSize);
            MicroAPI::DataCopyUnAlign(vreg2, uSrc, ((__ubuf__ T *&)newSrc), vlSize);
            MicroAPI::DataCopyUnAlign(vreg3, uSrc, ((__ubuf__ T *&)newSrc), vlSize);
            MicroAPI::DataCopyUnAlign(vreg4, uSrc, ((__ubuf__ T *&)newSrc), vlSize);
            MicroAPI::DataCopyUnAlign(vreg5, uSrc, ((__ubuf__ T *&)newSrc), vlSize);
            MicroAPI::DataCopyUnAlign(vreg6, uSrc, ((__ubuf__ T *&)newSrc), vlSize);
            MicroAPI::DataCopyUnAlign(vreg7, uSrc, ((__ubuf__ T *&)newSrc), vlSize);
            // L1
            Binaryfunc(vreg0, vreg0, vreg4, fullMask);
            Binaryfunc(vreg1, vreg1, vreg5, fullMask);
            Binaryfunc(vreg2, vreg2, vreg6, fullMask);
            Binaryfunc(vreg3, vreg3, vreg7, fullMask);
            // L2
            Binaryfunc(vreg0, vreg0, vreg2, fullMask);
            Binaryfunc(vreg1, vreg1, vreg3, fullMask);
            // L3
            Binaryfunc(vreg0, vreg0, vreg1, fullMask);
            if constexpr (IsSameType<T, bfloat16_t>::value) {
                ReduceOpInternal::ReduceARCastfoldOneToThree<T, float, Trait, ReduceOpInternal::CastTraitBF16F32,
                    ReduceOpInternal::CastTraitF32BF16, Binaryfunc, Reducefunc>(vreg0, vreg2, fullMask);
            } else if constexpr (SupportBytes<T, 1>()) {
                ReduceOpInternal::ReduceARCastfoldOneToThree<T, half, Trait, ReduceOpInternal::CastTraitB8F16,
                    ReduceOpInternal::CastTraitF16B8, Binaryfunc, Reducefunc>(vreg0, vreg2, fullMask);
            } else {
                Reducefunc(vreg2, vreg0, fullMask);
            }
            DataCopyUnAlign((__ubuf__ T *&)dstAddr, vreg2, uDst, 1);
        }
        MicroAPI::DataCopyUnAlignPost((__ubuf__ T *&)dstAddr, uDst, 0);
    }

    // Reduce to 1
    for (uint16_t i = 0; i < foldZero; i++) {
        if constexpr (IsSameType<T, bfloat16_t>::value) {
            ReduceARCastfoldZeroUnAligned<T, float, Trait, ReduceOpInternal::CastTraitBF16F32,
                ReduceOpInternal::CastTraitF32BF16, vlSize, Binaryfunc, Reducefunc>(dstAddr, srcAddr, dimA, dimR,
                fullMask);
        } else if constexpr (SupportBytes<T, 1>()) {
            ReduceARCastfoldZeroUnAligned<T, half, Trait, ReduceOpInternal::CastTraitB8F16,
                ReduceOpInternal::CastTraitF16B8, vlSize, Binaryfunc, Reducefunc>(dstAddr, srcAddr, dimA, dimR,
                fullMask);
        } else {
            for (uint16_t loopA = 0; loopA < static_cast<uint16_t>(dimA); loopA++) {
                newSrc = src + static_cast<uint64_t>(loopA * dimR * dtypeSize);
                MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)newSrc));
                MicroAPI::DataCopyUnAlign(vreg0, uSrc, ((__ubuf__ T *&)newSrc), vlSize);
                Reducefunc(vreg1, vreg0, fullMask);
                MicroAPI::DataCopyUnAlign((__ubuf__ T *&)dstAddr, vreg1, uDst, 1);
            }
            MicroAPI::DataCopyUnAlignPost((__ubuf__ T *&)dstAddr, uDst, 0);
        }
    }
}

template <class T, const MicroAPI::RegTrait &Trait, auto Binaryfunc, auto Reducefunc>
__aicore__ inline void ReduceAROverVLReuseSourceUnAligned(__ubuf__ T *dstAddr, __ubuf__ T *srcAddr, uint32_t dimA,
    uint32_t dimR)
{
    constexpr uint16_t vlSize = SupportBytes<T, 8>() ? GetVecLen() / sizeof(float) : GetVecLen() / sizeof(T);
    uint32_t mainR = ReduceOpInternal::CalculateMainR(dimR, true, vlSize);
    uint32_t tailR = dimR - mainR;
    uint16_t base = mainR / vlSize;
    uint16_t folds = ReduceOpInternal::CalculateFolds(base);
    uint16_t avgFolds = ReduceOpInternal::BASE_FOLD;
    uint16_t tailFolds = folds % avgFolds;
    uint16_t foldZero = (tailFolds == 0) ? 1 : 0;
    uint16_t foldOne = (tailFolds == ReduceOpInternal::FLOD_ONE) ? 1 : 0;
    uint16_t foldTwo = (tailFolds == ReduceOpInternal::FLOD_TWO) ? 1 : 0;
    uint16_t foldThree = (tailFolds == ReduceOpInternal::FLOD_THREE) ? 1 : 0;
    uint32_t postUpdateStrideTail = tailR % vlSize;
    ReduceAROverVLReuseSourceUnAlignedVF<T, Trait, Binaryfunc, Reducefunc>(dstAddr, srcAddr, dimA, dimR,
        mainR, tailR, base, folds, avgFolds, foldZero, foldOne, foldTwo, foldThree, postUpdateStrideTail);
}

template <class T, class U, const MicroAPI::RegTrait &Trait, const MicroAPI::CastTrait &CastTraitUppper,
    const MicroAPI::CastTrait &CastTraitLower, const uint16_t vlSize, auto Binaryfunc, auto Reducefunc>
__simd_vf__ inline void CastReduceARUnAligned(__ubuf__ T *dstAddr, __ubuf__ T *srcAddr, uint32_t dimA, uint32_t dimR)
{
    uint32_t dtypeSize = sizeof(T);
    if (dimR <= (vlSize / 2)) {
        if constexpr (SupportBytes<T, 2>()) {
            MicroAPI::RegTensor<T, Trait> vreg0;
            MicroAPI::RegTensor<T, Trait> vreg1;
            MicroAPI::RegTensor<U, Trait> vreg0CastB32;
            MicroAPI::RegTensor<U, Trait> vreg1CastB32;
            MicroAPI::UnalignReg uSrc;
            MicroAPI::UnalignReg uDst;
            uint32_t sreg1 = dimR;
            MicroAPI::MaskReg mask = MicroAPI::UpdateMask<U>(sreg1);
            MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)srcAddr));
            for (uint16_t loopA = 0; loopA < static_cast<uint16_t>(dimA); loopA++) {
                MicroAPI::DataCopyUnAlign(vreg0, uSrc, ((__ubuf__ T *&)srcAddr), dimR);
                MicroAPI::UnPack((MicroAPI::RegTensor<uint32_t, Trait> &)vreg0,
                    (MicroAPI::RegTensor<uint16_t, Trait> &)vreg0);
                MicroAPI::Cast<U, T, CastTraitUppper>(vreg0CastB32, vreg0, mask);
                Reducefunc(vreg1CastB32, vreg0CastB32, mask);
                MicroAPI::Cast<T, U, CastTraitLower>(vreg1, vreg1CastB32, mask);
                DataCopyUnAlign((__ubuf__ T *&)dstAddr, vreg1, uDst, 1);
            }
            MicroAPI::DataCopyUnAlignPost((__ubuf__ T *&)dstAddr, uDst, 0);
        } else if constexpr (SupportBytes<T, 1>()) {
            MicroAPI::RegTensor<T, Trait> vreg0;
            MicroAPI::RegTensor<T, Trait> vreg1;
            MicroAPI::RegTensor<U, Trait> vreg0CastB16;
            MicroAPI::RegTensor<U, Trait> vreg1CastB16;
            MicroAPI::UnalignReg uSrc;
            MicroAPI::UnalignReg uDst;
            uint32_t sreg1 = dimR;
            MicroAPI::MaskReg mask = MicroAPI::UpdateMask<U>(sreg1);
            MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)srcAddr));
            for (uint16_t loopA = 0; loopA < static_cast<uint16_t>(dimA); loopA++) {
                MicroAPI::DataCopyUnAlign(vreg0, uSrc, ((__ubuf__ T *&)srcAddr), dimR);
                MicroAPI::UnPack((MicroAPI::RegTensor<uint16_t, Trait> &)vreg0,
                    (MicroAPI::RegTensor<uint8_t, Trait> &)vreg0);
                MicroAPI::Cast<U, T, CastTraitUppper>(vreg0CastB16, vreg0, mask);
                Reducefunc(vreg1CastB16, vreg0CastB16, mask);
                MicroAPI::Cast<T, U, CastTraitLower>(vreg1, vreg1CastB16, mask);
                DataCopyUnAlign((__ubuf__ T *&)dstAddr, vreg1, uDst, 1);
            }
            MicroAPI::DataCopyUnAlignPost((__ubuf__ T *&)dstAddr, uDst, 0);
        }
    } else {
        if constexpr (SupportBytes<T, 2>()) {
            MicroAPI::RegTensor<T, Trait> vreg0;
            MicroAPI::RegTensor<T, Trait> vreg1;
            MicroAPI::RegTensor<T, Trait> vreg2;
            MicroAPI::RegTensor<U, Trait> vreg0CastB32;
            MicroAPI::RegTensor<U, Trait> vreg1CastB32;
            MicroAPI::UnalignReg uSrc0;
            MicroAPI::UnalignReg uSrc1;
            MicroAPI::UnalignReg uDst;
            uint32_t sreg1 = dimR;
            MicroAPI::MaskReg fullMask = MicroAPI::CreateMask<T, MicroAPI::MaskPattern::ALL, Trait>();
            MicroAPI::MaskReg mask = MicroAPI::UpdateMask<U>(sreg1);
            mask = MicroAPI::UpdateMask<U>(sreg1);
            MicroAPI::MaskPack(mask, mask);
            uint64_t newSrc = (uint64_t)srcAddr + vlSize / 2 * dtypeSize;
            for (uint16_t loopA = 0; loopA < static_cast<uint16_t>(dimA); loopA++) {
                MicroAPI::DataCopyUnAlignPre(uSrc0, ((__ubuf__ T *&)srcAddr));
                MicroAPI::DataCopyUnAlignPre(uSrc1, ((__ubuf__ T *&)newSrc));
                MicroAPI::DataCopyUnAlign(vreg0, uSrc0, ((__ubuf__ T *&)srcAddr), dimR);
                MicroAPI::DataCopyUnAlign(vreg1, uSrc1, ((__ubuf__ T *&)newSrc), dimR);
                Binaryfunc(vreg2, vreg0, vreg1, mask);
                Select(vreg2, vreg2, vreg0, mask);
                MicroAPI::UnPack((MicroAPI::RegTensor<uint32_t, Trait> &)vreg2,
                    (MicroAPI::RegTensor<uint16_t, Trait> &)vreg2);
                MicroAPI::Cast<U, T, CastTraitUppper>(vreg0CastB32, vreg2, fullMask);
                Reducefunc(vreg1CastB32, vreg0CastB32, fullMask);
                MicroAPI::Cast<T, U, CastTraitLower>(vreg1, vreg1CastB32, fullMask);
                DataCopyUnAlign((__ubuf__ T *&)dstAddr, vreg1, uDst, 1);
            }
            MicroAPI::DataCopyUnAlignPost((__ubuf__ T *&)dstAddr, uDst, 0);
        } else if constexpr (SupportBytes<T, 1>()) {
            MicroAPI::RegTensor<T, Trait> vreg0;
            MicroAPI::RegTensor<T, Trait> vreg1;
            MicroAPI::RegTensor<T, Trait> vreg2;
            MicroAPI::RegTensor<U, Trait> vreg0CastB16;
            MicroAPI::RegTensor<U, Trait> vreg1CastB16;
            MicroAPI::UnalignReg uSrc0;
            MicroAPI::UnalignReg uSrc1;
            MicroAPI::UnalignReg uDst;
            uint32_t sreg1 = dimR;
            MicroAPI::MaskReg fullMask = MicroAPI::CreateMask<T, MicroAPI::MaskPattern::ALL, Trait>();
            MicroAPI::MaskReg mask = MicroAPI::UpdateMask<U>(sreg1);
            mask = MicroAPI::UpdateMask<U>(sreg1);
            MicroAPI::MaskPack(mask, mask);
            uint64_t newSrc = (uint64_t)srcAddr + vlSize / 2 * dtypeSize;
            for (uint16_t loopA = 0; loopA < static_cast<uint16_t>(dimA); loopA++) {
                MicroAPI::DataCopyUnAlignPre(uSrc0, ((__ubuf__ T *&)srcAddr));
                MicroAPI::DataCopyUnAlignPre(uSrc1, ((__ubuf__ T *&)newSrc));
                MicroAPI::DataCopyUnAlign(vreg0, uSrc0, ((__ubuf__ T *&)srcAddr), dimR);
                MicroAPI::DataCopyUnAlign(vreg1, uSrc1, ((__ubuf__ T *&)newSrc), dimR);
                Binaryfunc(vreg2, vreg0, vreg1, mask);
                Select(vreg2, vreg2, vreg0, mask);
                MicroAPI::UnPack((MicroAPI::RegTensor<uint16_t, Trait> &)vreg2,
                    (MicroAPI::RegTensor<uint8_t, Trait> &)vreg2);
                MicroAPI::Cast<U, T, CastTraitUppper>(vreg0CastB16, vreg2, fullMask);
                Reducefunc(vreg1CastB16, vreg0CastB16, fullMask);
                MicroAPI::Cast<T, U, CastTraitLower>(vreg1, vreg1CastB16, fullMask);
                DataCopyUnAlign((__ubuf__ T *&)dstAddr, vreg1, uDst, 1);
            }
            MicroAPI::DataCopyUnAlignPost((__ubuf__ T *&)dstAddr, uDst, 0);
        }
    }
}

template <class T, const MicroAPI::RegTrait &Trait, auto Binaryfunc, auto Reducefunc>
__simd_vf__ inline void ReduceARReuseSourceUnAlignedVF(__ubuf__ T *dstAddr, __ubuf__ T *srcAddr, uint32_t dimA,
    uint32_t dimR)
{
    MicroAPI::RegTensor<T, Trait> vreg0;
    MicroAPI::RegTensor<T, Trait> vreg1;
    MicroAPI::UnalignReg uSrc;
    MicroAPI::UnalignReg uDst;
    uint32_t sreg1 = dimR;
    MicroAPI::MaskReg mask = MicroAPI::UpdateMask<T, Trait>(sreg1);
    MicroAPI::DataCopyUnAlignPre(uSrc, ((__ubuf__ T *&)srcAddr));
    for (uint16_t loopA = 0; loopA < static_cast<uint16_t>(dimA); loopA++) {
        MicroAPI::DataCopyUnAlign(vreg0, uSrc, ((__ubuf__ T *&)srcAddr), dimR);
        Reducefunc(vreg1, vreg0, mask);
        DataCopyUnAlign((__ubuf__ T *&)dstAddr, vreg1, uDst, 1);
    }
    MicroAPI::DataCopyUnAlignPost((__ubuf__ T *&)dstAddr, uDst, 0);
}

template <class T, const MicroAPI::RegTrait &Trait, auto Binaryfunc, auto Reducefunc>
__aicore__ inline void ReduceARReuseSourceUnAligned(__ubuf__ T *dstAddr, __ubuf__ T *srcAddr, uint32_t dimA,
    uint32_t dimR)
{
    constexpr uint16_t vlSize = SupportBytes<T, 8>() ? GetVecLen() / sizeof(float) : GetVecLen() / sizeof(T);
    if (dimR < vlSize) {
        if constexpr (IsSameType<T, bfloat16_t>::value) {
            CastReduceARUnAligned<T, float, Trait, ReduceOpInternal::CastTraitBF16F32,
                ReduceOpInternal::CastTraitF32BF16, vlSize, Binaryfunc, Reducefunc>(dstAddr, srcAddr, dimA, dimR);
        } else if constexpr (SupportBytes<T, 1>()) {
            CastReduceARUnAligned<T, half, Trait, ReduceOpInternal::CastTraitB8F16, ReduceOpInternal::CastTraitF16B8,
                vlSize, Binaryfunc, Reducefunc>(dstAddr, srcAddr, dimA, dimR);
        } else {
            ReduceARReuseSourceUnAlignedVF<T, Trait, Binaryfunc, Reducefunc>(dstAddr, srcAddr, dimA, dimR);
        }
    } else {
        if constexpr (SupportBytes<T, 8>()) {
            ReduceAROverVLReuseSourceUnAlignedB64<T, Trait, Binaryfunc, Reducefunc>(dstAddr, srcAddr, dimA, dimR);
        } else {
            ReduceAROverVLReuseSourceUnAligned<T, Trait, Binaryfunc, Reducefunc>(dstAddr, srcAddr, dimA, dimR);
        }
    }
}
} // namespace AscendC
#endif // IMPL_REDUCE_REDUCE_COMMON_AR_RA_REUSE_UNALIGN_C310_IMPL_H